This Rmarkdonwn document can be used to reproduce the panels in figure 3 of the manuscript: huva: human variation in gene expression as surrogate for gene function. To run this script without any problem of dependencies or conflicts in the installation we suggest to use the docker image we provide (see README file).

1 Figure 3 (Supp. Figures S5, S6, S7, S8, S9)

In this figure we evaluate the robustness of the huva methods. We will perform here a series of experiment to show the robustness and valitidy of our approach.

1.1 Loading required packages

library(huva)
library(huva.db)
library(ggplot2)
library(tidyverse)
library(reshape2)
library(limma)
library(fgsea)
library(Rmisc)
library(ggpubr)
library(pheatmap)
library(viridis)
library(foreach)
library(ggsci)

1.2 Costum Functions

source("./source/costum_functions.R")

1.3 Setting seed

set.seed(91)

1.4 Parallel computing

Part of the simulation executed here is calculated in parallel to lower the computational time, this is not necessary, the user can decide to activate or not this feature, changing the parameter ‘par_cal’ to TRUE

par_cal <- FALSE

if (par_cal == TRUE) {
 
  if (require(doMC) == F) {
  BiocManager::install("doMC", version = "3.12", update = F)
  }

  library(doMC)
  
  # Devine the number of workers for the parallel computing
  workers <- 4
   
}

1.5 Relevance of sample selection

We here wannt to compare the result of an huva experiment, where the comparison if between two groups characterized by low or high expression of a gene of interest, and a random selection of samples in the two experimental groups. We defined the run_huva_experiment_random_sample function which does not select the samples based on the expression of the gene of interest but only randomize the two experimental groups of equal size to the original huva groups. Here for example a randomized huva experiment was perfomed of CRELD1 giving almost no difference in the expression of the GOI (boxplot).

binned_random <- run_huva_experiment_random_sample(data = huva.db, 
                                                    gene = "CRELD1", 
                                                    quantiles = 0.10, 
                                                    gs_list = hallmarks_V7.2,
                                                    summ = T, 
                                                    datasets_list = c("FG500"), 
                                                    adjust.method = "BH")
## [1] "Binning on CRELD1 expression"
plot_binned_gene(goi = "CRELD1", huva_experiment = binned_random)$PBMC + theme_bw() +theme(aspect.ratio = 2) + 
  expand_limits(y = 7) + ggtitle("Randomized huva experiment")

1.5.1 Number of random permutation

select the number of permutation in the experiment, for simplicity I selected the same number of samples in the dataset

n.random <- round(dim(huva.db$FG500$data$PBMC)[2])

1.5.2 Running the experiment

I will now compare the result of the real huva experiment with the the result of the randomly selected samples. This same analysis will be performed with both a gene showing high and low variance across the dataset.

variance <- RowVar(huva.db$FG500$data$PBMC)
variance <- variance[order(variance, decreasing = T)]

1.5.2.1 Gene with low variance

# Container for the results
random_low <- matrix(nrow = n.random+1, ncol = 3)
colnames(random_low) <- c("logFC", "p.val", "n.DEgenes")

# The randomized experiment
for (i in 1:n.random) {
  binned_random <- run_huva_experiment_random_sample(data = huva.db, 
                                                    gene = "CRELD1", 
                                                    quantiles = 0.10, 
                                                    gs_list = hallmarks_V7.2,
                                                    summ = F, 
                                                    datasets_list = c("FG500"), 
                                                    adjust.method = "BH")

  logFC <- binned_random$FG500$DE_genes$PBMC["CRELD1",]$logFC
  p.val <- binned_random$FG500$DE_genes$PBMC["CRELD1",]$P.Value
  n.DEgenes <- sum(binned_random$FG500$DE_genes$PBMC$adj.P.Val <=0.05)
  
  random_low[i,] <- c(logFC, p.val, n.DEgenes) 
}

# The original experiment
binned_dataset <- run_huva_experiment(data = huva.db, 
                                      gene = "CRELD1", 
                                      quantiles = 0.10, 
                                      gs_list = hallmarks_V7.2,
                                      summ = F, 
                                      datasets_list = c("FG500"), 
                                      adjust.method = "BH")

logFC <- binned_dataset$FG500$DE_genes$FG500_PBMC["CRELD1",]$logFC
p.val <- binned_dataset$FG500$DE_genes$FG500_PBMC["CRELD1",]$P.Value
n.DEgenes <- sum(binned_dataset$FG500$DE_genes$FG500_PBMC$adj.P.Val <=0.05)

# Preparation of the table
random_low[n.random+1,] <- c(logFC, p.val, n.DEgenes)

random_low <- as.data.frame(random_low)

random_low$type <- "random"

random_low$type[dim(random_low)[1]] <- "real"

random_low$absFC <- abs(random_low$logFC)

random_low$neglogpval <- -log10(random_low$p.val)

1.5.3 Gene with high variance

# Container for the results
random_high <- matrix(nrow = n.random+1, ncol = 3)
colnames(random_high) <- c("logFC", "p.val", "n.DEgenes")

# The randomized experiment
for (i in 1:n.random) {
  binned_random <- run_huva_experiment_random_sample(data = huva.db, 
                                                    gene = "SLC12A1", 
                                                    quantiles = 0.10, 
                                                    gs_list = hallmarks_V7.2,
                                                    summ = F, 
                                                    datasets_list = c("FG500"), 
                                                    adjust.method = "BH")

  logFC <- binned_random$FG500$DE_genes$PBMC["SLC12A1",]$logFC
  p.val <- binned_random$FG500$DE_genes$PBMC["SLC12A1",]$P.Value
  n.DEgenes <- sum(binned_random$FG500$DE_genes$PBMC$adj.P.Val <=0.05)
  
  random_high[i,] <- c(logFC, p.val, n.DEgenes) 
}

# The original experiment
binned_dataset <- run_huva_experiment(data = huva.db, 
                                      gene = "SLC12A1", 
                                      quantiles = 0.10, 
                                      gs_list = hallmarks_V7.2,
                                      summ = T, 
                                      datasets_list = c("FG500"), 
                                      adjust.method = "BH")

logFC <- binned_dataset$FG500$DE_genes$FG500_PBMC["SLC12A1",]$logFC
p.val <- binned_dataset$FG500$DE_genes$FG500_PBMC["SLC12A1",]$P.Value
n.DEgenes <- sum(binned_dataset$FG500$DE_genes$FG500_PBMC$adj.P.Val <=0.05)

# Preparation of the table
random_high[n.random+1,] <- c(logFC, p.val, n.DEgenes)

random_high <- as.data.frame(random_high)

random_high$type <- "random"

random_high$type[dim(random_high)[1]] <- "real"

random_high$absFC <- abs(random_high$logFC)

random_high$neglogpval <- -log10(random_high$p.val)

# Merging the two experiments
ds_low <- melt(random_low)

ds_high <- melt(random_high)

ds_low$data_type <- "low_variance"

ds_high$data_type <- "high_variance"

ds_random_sampling <- rbind(ds_low, ds_high)

ds_random_sampling$type <- factor(ds_random_sampling$type, levels = c("real", "random"))

ds_random_sampling <- ds_random_sampling[!ds_random_sampling$variable %in% c("logFC", "p.val"),]

ds_random_sampling$experiment <- "sample_random"

ds_random_sampling$variable <- factor(ds_random_sampling$variable, 
                                      levels = c("absFC", "neglogpval", "n.DEgenes"), 
                                      labels = c("absolute FC", "-log10p value", "n. DE genes"))

rm(binned_dataset, binned_random, ds_high, ds_low, random_high, random_low, i, logFC, n.DEgenes, n.random, p.val, variance)

1.5.3.1 Plots

ggplot(ds_random_sampling[ds_random_sampling$data_type == "high_variance",], aes(x=experiment, y=value, fill=type)) +
  geom_point(shape=21, position = position_dodge2(.4), size=3) +
  facet_wrap(~variable, scales = "free") +
  theme_bw() + theme(aspect.ratio = 2) +
  scale_fill_manual(values = c("#BC3C29", "#01A087")) +
  ggtitle("Figure 3b and 3c")

ggplot(ds_random_sampling[ds_random_sampling$data_type == "low_variance",], aes(x=experiment, y=value, fill=type)) +
  geom_point(shape=21, position = position_dodge2(.4), size=3) +
  facet_wrap(~variable, scales = "free") +
  theme_bw() + theme(aspect.ratio = 2) +
  scale_fill_manual(values = c("#BC3C29", "#01A087")) +
  ggtitle("Figure S5a and S5b")

1.6 Overlap of DE genes across experiments

We now want to evaluate what is the overlap in the DE genes calculated with an huva experiment across 1593 genes of interest in the 500FG dataset.

1.6.1 Random selection of genes for the analysis

Random selection of the 1593 genes used for the experiment.

selection <- sample(rownames(huva.db$FG500$data$PBMC), 
                    round(dim(huva.db$FG500$data$PBMC)[1]*0.1), replace = F)

1.6.2 Prepatation of bias dataset

bias_ds <- huva.db$FG500$data$PBMC

bias_ds <- t(seq(1, 1.1, length.out = dim(bias_ds)[2])*t(bias_ds))

huva_default_bias <- huva.db

huva_default_bias$FG500$data[["PBMC"]] <- bias_ds

rm(bias_ds)

1.6.2.1 HM visualization of the bias

pheatmap(huva.db$FG500$data$PBMC[1:100,], 
         cluster_rows = F, cluster_cols = F, 
         scale = "row", 
         border_color = FALSE, 
         color = colorRampPalette(c("blue", "white", "red"))(100), 
         main = "Figure S6b - original dataset", 
         show_rownames = FALSE, 
         show_colnames = FALSE, 
         breaks = seq(-5,5, length.out = 100), 
         cellwidth = 4, cellheight = 3)

pheatmap(huva_default_bias$FG500$data$PBMC[1:100,], 
         cluster_rows = F, cluster_cols = F, 
         scale = "row", 
         border_color = FALSE, 
         color = colorRampPalette(c("blue", "white", "red"))(100), 
         main = "Figure S6b - bias dataset", 
         show_rownames = FALSE, 
         show_colnames = FALSE, 
         breaks = seq(-5,5, length.out = 100),
         cellwidth = 4, cellheight = 3)

1.6.2.2 Boxplot visualization of the bias

df_orignal <- melt(huva.db$FG500$data$PBMC)
df_orignal$data_type <- "original"
df_bias <- melt(huva_default_bias$FG500$data$PBMC)
df_bias$data_type <- "bias"

bias_vis_df <- rbind(df_orignal, df_bias)
bias_vis_df$data_type <- factor(bias_vis_df$data_type, levels = c("original", "bias"))

# sample onlx 10 of the sample of the dataset to display
set.seed(1)

ggplot(bias_vis_df[bias_vis_df$Var2 %in% sample(levels(bias_vis_df$Var2), size = 5),], aes(x=Var2, y=value, fill=data_type)) +
  geom_boxplot() + 
  scale_y_continuous(limits = c(0,20)) + 
  scale_fill_manual(values = c("#BC3C29", "#01A087")) + 
  theme_bw() +
  ggtitle("Figure S6c - random selection of 5 samples to show the bias") + theme(aspect.ratio = 2/5)

values <- seq(1, 1.1, length.out = 95)
names(values) <- colnames(huva_default_bias$FG500$data$PBMC)

print(paste("The bias on the samples",
            paste(c("HV078", "HV126", "HV259", "HV185", "HV435", "is:"), collapse = ", "),
            paste(values[c("HV078", "HV126", "HV259", "HV185", "HV435")], collapse = ", "), 
            sep = " "))
## [1] "The bias on the samples HV078, HV126, HV259, HV185, HV435, is: 1, 1.03510638297872, 1.04042553191489, 1.07127659574468, 1.09148936170213"
rm(df_bias, df_orignal, bias_vis_df, values)

1.6.3 Getting the DE genes overlap

Here we source the function to calculate the overlap in DE genes. First with the real 500FG datased and second with the biased dataset.

source("./source/DE_overlap_orig.R")
print("Value distribution of the original dataset")
## [1] "Value distribution of the original dataset"
summary(DE_comp_original$overlap)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.01427 0.21550 0.41394 1.00000
source("./source/DE_overlap_bias.R")
print("Value distribution of the biased dataset")
## [1] "Value distribution of the biased dataset"
summary(DE_comp_bias$overlap)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2446  0.5790  0.5148  0.7861  1.0000

1.6.3.1 Visualize the DE genes overlap

DE_comp_original$data_type <- "original"
DE_comp_bias$data_type <- "bias"
DE_comp_merge <- rbind(DE_comp_original, DE_comp_bias)

DE_comp_merge$data_type <- factor(DE_comp_merge$data_type, levels = c("original", "bias"))

ggplot(DE_comp_merge, aes(x=data_type, y=overlap, fill=data_type)) +
  geom_boxplot() +
  scale_fill_manual(values = c("#BC3C29", "#01A087")) + 
  theme_bw() + theme(aspect.ratio = 2) +
  xlab("")+
  ylab("DE genes overlap (fraction)") +
  ggtitle("Figure 3e - Overlap DE genes - boxplot")

t.test(DE_comp_merge[DE_comp_merge$data_type=="original",]$overlap, 
       DE_comp_merge[DE_comp_merge$data_type=="bias",]$overlap, paired = F)
## 
##  Welch Two Sample t-test
## 
## data:  DE_comp_merge[DE_comp_merge$data_type == "original", ]$overlap and DE_comp_merge[DE_comp_merge$data_type == "bias", ]$overlap
## t = -1129.8, df = 5019517, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2997777 -0.2987394
## sample estimates:
## mean of x mean of y 
## 0.2154973 0.5147559
rm(DE_comp_merge)
tmp <- DE_comp_original_matrix

for (i in 2:dim(tmp)[2]) {
  tmp[1:(i-1),i] <- NA
}

pheatmap(tmp, 
         scale = "none", 
         cluster_rows = F, 
         cluster_cols = F, 
         color = inferno(direction = -1, 50), 
         show_rownames = F, 
         show_colnames = F, 
         cellwidth = 0.15, 
         cellheight = 0.15, 
         border_color = NA, na_col = NA,
         breaks = seq(0, 1, length.out = 50),
         main = "Figure 3f - Original dataset")

rm(tmp)
tmp <- DE_comp_bias_matrix

for (i in 2:dim(tmp)[2]) {
  tmp[1:(i-1),i] <- NA
}

pheatmap(tmp, 
         scale = "none", 
         cluster_rows = F, 
         cluster_cols = F, 
         color = inferno(direction = -1, 50), 
         show_rownames = F, 
         show_colnames = F, 
         cellwidth = 0.15, 
         cellheight = 0.15, 
         border_color = NA, na_col = NA,
         breaks = seq(0, 1, length.out = 50),
         main = "Figure 3f - Bias dataset")

rm(tmp)

1.7 Correlation of the gene ranks

We now want to evaluate what is Pearson’s correlation between huva experiment across 1593 genes of interest in the 500FG dataset.

1.7.1 Getting the rank comparison

Here we source the function for the calculation of the pair-wise correlation coefficient. First for the original 500FG dataset and second for the biased dataset.

NOTE: This calculation required long time for the pair-wise comparisons, we provide the pre calculated tables to reproduce the plot in the manuscript and the original code.

# source("./source/Rank_overlap_orig.R")

rank_comp_origina <- readRDS("./data/rank_comp_orig.rds")
rank_comp_origina_matrix <- readRDS("./data/rank_comp_orig_mat.rds")
print("Value distribution of the original dataset")
## [1] "Value distribution of the original dataset"
summary(rank_comp_origina$overlap)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.988899 -0.462729  0.007376  0.004457  0.470226  0.992602
# source("./source/Rank_overlap_bias.R")

rank_comp_bias <- readRDS("./data/rank_comp_bias.rds")
rank_comp_bias_matrix <- readRDS("./data/rank_comp_bias_mat.rds")
print("Value distribution of the bias dataset")
## [1] "Value distribution of the bias dataset"
summary(rank_comp_bias$overlap)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.9432 -0.1168  0.2969  0.2286  0.6104  0.9948

1.7.1.1 Visualize the overlap

rank_comp_origina$data_type <- "original"
rank_comp_bias$data_type <- "bias"
rank_comp_merge <- rbind(rank_comp_origina, rank_comp_bias)

rank_comp_merge$data_type <- factor(rank_comp_merge$data_type, levels = c("original", "bias"))

ggplot(rank_comp_merge, aes(x=data_type, y=overlap, fill=data_type)) +
  geom_boxplot() +
  scale_fill_manual(values = c("#BC3C29", "#01A087")) + 
  theme_bw() + theme(aspect.ratio = 2) +
  xlab("")+
  ylab("DE genes overlap (fraction)") +
  ggtitle("Figure S8b - Overlap DE genes - boxplot")

t.test(rank_comp_merge[rank_comp_merge$data_type=="original",]$overlap, 
       rank_comp_merge[rank_comp_merge$data_type=="bias",]$overlap, paired = F)
## 
##  Welch Two Sample t-test
## 
## data:  rank_comp_merge[rank_comp_merge$data_type == "original", ]$overlap and rank_comp_merge[rank_comp_merge$data_type == "bias", ]$overlap
## t = -509.56, df = 4939704, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2249631 -0.2232392
## sample estimates:
##   mean of x   mean of y 
## 0.004457361 0.228558501
rm(rank_comp_merge)
tmp <- rank_comp_origina_matrix

for (i in 2:dim(tmp)[2]) {
  tmp[1:(i-1),i] <- NA
}

pheatmap(tmp, 
         scale = "none", 
         cluster_rows = F, 
         cluster_cols = F, 
         color = inferno(direction = -1, 50), 
         show_rownames = F, 
         show_colnames = F, 
         cellwidth = 0.15, 
         cellheight = 0.15, 
         breaks = seq(0, 1, length.out = 50), 
         border_color = NA, na_col = NA, 
         main = "Figure S8c - Original dataset")

rm(tmp)
tmp <- rank_comp_bias_matrix

for (i in 2:dim(tmp)[2]) {
  tmp[1:(i-1),i] <- NA
}

pheatmap(tmp, 
         scale = "none", 
         cluster_rows = F, 
         cluster_cols = F, 
         color = inferno(direction = -1, 50), 
         show_rownames = F, 
         show_colnames = F, 
         cellwidth = 0.15, 
         cellheight = 0.15, 
         breaks = seq(0, 1, length.out = 50), 
         border_color = NA, na_col = NA, 
         main = "Figure S8d - Bias dataset")

rm(tmp)

1.8 Overlap of the samples

We now want to evaluate in the selection of samples in the HIGH and LOW groups for an huva experiment across 1593 genes of interest in the 500FG dataset.

1.8.1 Calculating the sample overlap

Here we source the function for the calculation of the overlap. First for the original 500FG dataset and second for the biased dataset.

source("./source/sample_overlap_orig.R")
print("Value distribution of the original dataset - LOW group")
## [1] "Value distribution of the original dataset - LOW group"
summary(samples_comp_original_low$overlap)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.1000  0.1485  0.2000  1.0000
print("Value distribution of the original dataset - HIGH group")
## [1] "Value distribution of the original dataset - HIGH group"
summary(samples_comp_original_high$overlap)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.1000  0.1366  0.2000  1.0000
source("./source/sample_overlap_bias.R")
print("Value distribution of the bias dataset - LOW group")
## [1] "Value distribution of the bias dataset - LOW group"
summary(samples_comp_bias_low$overlap)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1000  0.2000  0.2336  0.3000  1.0000
print("Value distribution of the bias dataset - HIGH group")
## [1] "Value distribution of the bias dataset - HIGH group"
summary(samples_comp_bias_high$overlap)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1000  0.2000  0.2205  0.3000  1.0000

1.8.1.1 Visualize the overlap

samples_comp_original_low$data_type <- "original"
samples_comp_bias_low$data_type <- "bias"
samples_comp_merge_low <- rbind(samples_comp_original_low, samples_comp_bias_low)

samples_comp_merge_low$data_type <- factor(samples_comp_merge_low$data_type, levels = c("original", "bias"))

ggplot(samples_comp_merge_low, aes(x=data_type, y=overlap, fill=data_type)) +
  geom_boxplot() +
  scale_fill_manual(values = c("#BC3C29", "#01A087")) + 
  theme_bw() + theme(aspect.ratio = 2) +
  xlab("")+
  ylab("samples overlap (fraction)") +
  ggtitle("Figure S7b - Overlap Samples LOW group - boxplot")

t.test(samples_comp_merge_low[samples_comp_merge_low$data_type=="original",]$overlap, 
       samples_comp_merge_low[samples_comp_merge_low$data_type=="bias",]$overlap, paired = F)
## 
##  Welch Two Sample t-test
## 
## data:  samples_comp_merge_low[samples_comp_merge_low$data_type == "original", ]$overlap and samples_comp_merge_low[samples_comp_merge_low$data_type == "bias", ]$overlap
## t = -634.77, df = 5003559, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.08534185 -0.08481646
## sample estimates:
## mean of x mean of y 
## 0.1485132 0.2335924
rm(samples_comp_merge_low)
tmp <- samples_comp_original_matrix_low

for (i in 2:dim(tmp)[2]) {
  tmp[1:(i-1),i] <- NA
}

pheatmap(tmp, 
         scale = "none", 
         cluster_rows = F, 
         cluster_cols = F, 
         color = inferno(direction = -1, 50), 
         show_rownames = F, 
         show_colnames = F, 
         cellwidth = 0.15, 
         cellheight = 0.15, 
         border_color = NA, na_col = NA, 
         breaks = seq(0, 1, length.out = 50),  
         main = "Figure S7c - Original dataset")

rm(tmp)
tmp <- samples_comp_bias_matrix_low

for (i in 2:dim(tmp)[2]) {
  tmp[1:(i-1),i] <- NA
}

pheatmap(tmp, 
         scale = "none", 
         cluster_rows = F, 
         cluster_cols = F, 
         color = inferno(direction = -1, 50), 
         show_rownames = F, 
         show_colnames = F, 
         cellwidth = 0.15, 
         cellheight = 0.15, 
         na_col = NA, border_color = NA, 
         breaks = seq(0, 1, length.out = 50),  
         main = "Figure S7c - Original dataset")

rm(tmp)
samples_comp_original_high$data_type <- "original"
samples_comp_bias_high$data_type <- "bias"
samples_comp_merge_high <- rbind(samples_comp_original_high, samples_comp_bias_high)

samples_comp_merge_high$data_type <- factor(samples_comp_merge_high$data_type, levels = c("original", "bias"))

ggplot(samples_comp_merge_high, aes(x=data_type, y=overlap, fill=data_type)) +
  geom_boxplot() +
  scale_fill_manual(values = c("#BC3C29", "#01A087")) + 
  theme_bw() + theme(aspect.ratio = 2) +
  xlab("")+
  ylab("samples overlap (fraction)") +
  ggtitle("Figure S7b - Overlap Samples HIGH group - boxplot")

t.test(samples_comp_merge_high[samples_comp_merge_high$data_type=="original",]$overlap, 
       samples_comp_merge_high[samples_comp_merge_high$data_type=="bias",]$overlap, paired = F)
## 
##  Welch Two Sample t-test
## 
## data:  samples_comp_merge_high[samples_comp_merge_high$data_type == "original", ]$overlap and samples_comp_merge_high[samples_comp_merge_high$data_type == "bias", ]$overlap
## t = -638.52, df = 5050152, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.08414215 -0.08362718
## sample estimates:
## mean of x mean of y 
## 0.1365698 0.2204544
rm(samples_comp_merge_high)
tmp <- samples_comp_original_matrix_high

for (i in 2:dim(tmp)[2]) {
  tmp[1:(i-1),i] <- NA
}

pheatmap(tmp, 
         scale = "none", 
         cluster_rows = F, 
         cluster_cols = F, 
         color = inferno(direction = -1, 50), 
         show_rownames = F, 
         show_colnames = F, 
         cellwidth = 0.15, 
         cellheight = 0.15, 
         border_color = NA, na_col = NA, 
         breaks = seq(0, 1, length.out = 50),  
         main = "Figure S7e - Original dataset HIGH group")

rm(tmp)
tmp <- samples_comp_bias_matrix_high

for (i in 2:dim(tmp)[2]) {
  tmp[1:(i-1),i] <- NA
}

pheatmap(tmp, 
         scale = "none", 
         cluster_rows = F, 
         cluster_cols = F, 
         color = inferno(direction = -1, 50), 
         show_rownames = F, 
         show_colnames = F, 
         cellwidth = 0.15, 
         cellheight = 0.15, 
         border_color = NA, na_col = NA, 
         breaks = seq(0, 1, length.out = 50),  
         main = "Figure S7e - Bias dataset HIGH group")

rm(tmp)

1.8.1.2 Random selection of 10 samples

We compare here the result of the overlap of samples when randomly selecting samples

source("./source/sample_overlap_random.R")
summary(sample_comp_random$overlap)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.1000  0.1053  0.2000  0.7000
ggplot(sample_comp_random, aes(x="random", y=overlap)) +
  geom_boxplot(fill = "gray") +
  xlab("")+
  ylab("samples overlap (fraction)") +
  theme_bw() + theme(aspect.ratio = 4) + ggtitle("Randoms Sampling Overlap - data not shown")

1.9 Randomization of the signatures

1.9.1 MYD88

set.seed(91) 

n.random.sign <- 1000

core <- huva.db::msigdb_V7.2$GSE22935_WT_VS_MYD88_KO_MACROPHAGE_UP

random_sign <- list()

for (i in 1:n.random.sign) {
  sign <- sample(rownames(huva.db$FG500$data$PBMC), 186, replace = F)
  random_sign[[paste("random nr. ", i, sep = "")]] <- sign
}
random_sign[["core"]] <- core

binned_dataset <- run_huva_experiment(data = huva.db, 
                                      gene = "MYD88", 
                                      quantiles = 0.10, 
                                      gs_list = random_sign,
                                      summ = F, 
                                      datasets_list = c("FG500"), 
                                      adjust.method = "BH")
## [1] "Binning on MYD88 expression"
binned_dataset$FG500$gsea$FG500_PBMC$data_type <- c(rep("random", n.random.sign), "real")

binned_dataset$FG500$gsea$FG500_PBMC$data_type <- factor(binned_dataset$FG500$gsea$FG500_PBMC$data_type, levels = c("real", "random"))

rm(i, core, n.random.sign, sign, random_sign)

1.9.1.1 Plot

ggplot(binned_dataset$FG500$gsea$FG500_PBMC, aes(x=NES, y=-log10(pval), size=-log10(pval), fill=data_type))+
  geom_point(shape=21) +
  theme_bw() + theme(aspect.ratio = 2) +
  scale_x_continuous(limits = c(-3, 3)) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  geom_vline(xintercept = 0, linetype = "dashed") +
  scale_fill_manual(values = c("#BC3C29", "#01A087")) +
  ggtitle("Figure 3h - Random gene signatures - MYD88") +
  scale_y_continuous(limits = c(0,3))

rm(binned_dataset)

1.9.2 AKT

set.seed(91) 

n.random.sign <- 1000

core <- huva.db::msigdb_V7.2$AKT_UP.V1_UP

random_sign <- list()

for (i in 1:n.random.sign) {
  sign <- sample(rownames(huva.db$FG500$data$PBMC), 138, replace = F)
  random_sign[[paste("random nr. ", i, sep = "")]] <- sign
}
random_sign[["core"]] <- core

binned_dataset <- run_huva_experiment(data = huva.db, 
                                      gene = "AKT1", 
                                      quantiles = 0.10, 
                                      gs_list = random_sign,
                                      summ = F, 
                                      datasets_list = c("FG500"), 
                                      adjust.method = "BH")
## [1] "Binning on AKT1 expression"
binned_dataset$FG500$gsea$FG500_PBMC$data_type <- c(rep("random", n.random.sign), "real")

binned_dataset$FG500$gsea$FG500_PBMC$data_type <- factor(binned_dataset$FG500$gsea$FG500_PBMC$data_type, levels = c("real", "random"))

rm(i, core, n.random.sign, sign, random_sign)
1.9.2.0.1 Plot
ggplot(binned_dataset$FG500$gsea$FG500_PBMC, aes(x=NES, y=-log10(pval), size=-log10(pval), fill=data_type))+
  geom_point(shape=21) +
  theme_bw() + theme(aspect.ratio = 2) +
  scale_x_continuous(limits = c(-3, 3)) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  geom_vline(xintercept = 0, linetype = "dashed") +
  scale_fill_manual(values = c("#BC3C29", "#01A087")) +
  ggtitle("Figure S9a - Random gene signatures - AKT") +
  scale_y_continuous(limits = c(0,3))

rm(binned_dataset)

1.9.3 ERK

set.seed(91) 

n.random.sign <- 1000

core <- huva.db::msigdb_V7.2$GO_ERK1_AND_ERK2_CASCADE

random_sign <- list()

for (i in 1:n.random.sign) {
  sign <- sample(rownames(huva.db$FG500$data$PBMC), 219, replace = F)
  random_sign[[paste("random nr. ", i, sep = "")]] <- sign
}
random_sign[["core"]] <- core

binned_dataset <- run_huva_experiment(data = huva.db, 
                                      gene = "MAPK3", 
                                      quantiles = 0.10, 
                                      gs_list = random_sign,
                                      summ = F, 
                                      datasets_list = c("FG500"), 
                                      adjust.method = "BH")
## [1] "Binning on MAPK3 expression"
binned_dataset$FG500$gsea$FG500_PBMC$data_type <- c(rep("random", n.random.sign), "real")

binned_dataset$FG500$gsea$FG500_PBMC$data_type <- factor(binned_dataset$FG500$gsea$FG500_PBMC$data_type, levels = c("real", "random"))

rm(i, core, n.random.sign, sign, random_sign)

1.9.3.1 Plot

ggplot(binned_dataset$FG500$gsea$FG500_PBMC, aes(x=NES, y=-log10(pval), size=-log10(pval), fill=data_type))+
  geom_point(shape=21) +
  theme_bw() + theme(aspect.ratio = 2) +
  scale_x_continuous(limits = c(-3, 3)) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  geom_vline(xintercept = 0, linetype = "dashed") +
  scale_fill_manual(values = c("#BC3C29", "#01A087")) +
  ggtitle("Figure S9b - Random gene signatures - ERK") +
  scale_y_continuous(limits = c(0,3)) 

rm(binned_dataset)

1.9.4 STAT1

set.seed(91) 

n.random.sign <- 1000

core <- huva.db::msigdb_V7.2$GSE40666_WT_VS_STAT1_KO_CD8_TCELL_UP

random_sign <- list()

for (i in 1:n.random.sign) {
  sign <- sample(rownames(huva.db$FG500$data$PBMC), 195, replace = F)
  random_sign[[paste("random nr. ", i, sep = "")]] <- sign
}
random_sign[["core"]] <- core

binned_dataset <- run_huva_experiment(data = huva.db, 
                                      gene = "STAT1", 
                                      quantiles = 0.10, 
                                      gs_list = random_sign,
                                      summ = F, 
                                      datasets_list = c("FG500"), 
                                      adjust.method = "BH")
## [1] "Binning on STAT1 expression"
binned_dataset$FG500$gsea$FG500_PBMC$data_type <- c(rep("random", n.random.sign), "real")

binned_dataset$FG500$gsea$FG500_PBMC$data_type <- factor(binned_dataset$FG500$gsea$FG500_PBMC$data_type, levels = c("real", "random"))

rm(i, core, n.random.sign, sign, random_sign)

1.9.4.1 Plot

ggplot(binned_dataset$FG500$gsea$FG500_PBMC, aes(x=NES, y=-log10(pval), size=-log10(pval), fill=data_type))+
  geom_point(shape=21) +
  theme_bw() + theme(aspect.ratio = 2) +
  scale_x_continuous(limits = c(-3, 3)) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  geom_vline(xintercept = 0, linetype = "dashed") +
  scale_fill_manual(values = c("#BC3C29", "#01A087")) +
  ggtitle("Figure S9c - Random gene signatures - STAT1") +
  scale_y_continuous(limits = c(0,3))

rm(binned_dataset)

1.10 Variation of the binning values

1.10.1 Calculation of the experiment

set.seed(91)

MyD88_quant <- optimal_quant(gene_name = "MYD88", FC.Cut = 0)
## Warning: Zero sample variances detected, have been offset away from zero

1.10.1.1 Plot

MyD88_quant$plot_FC +
  ggtitle("Fig. S9e - Quantile selection log2 FC") +
  xlab("quantile") +
  ylab("log2 fold change") + 
  scale_fill_manual(values = c("#BC3C29", "#01A087"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

MyD88_quant$plot_adj.P.val +
  ggtitle("Fig. S9f - Quantile selection -log10 p value") +
  xlab("quantile") +
  ylab("-log10 p value") + 
  scale_fill_manual(values = c("#BC3C29", "#01A087"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

MyD88_quant$plot_result + 
  scale_x_continuous(limits = c(0.085, 0.4)) +
  scale_y_continuous(limits = c(0.3, 0.9), breaks = c(0.3, 0.45, 0.6, 0.75, 0.9)) + 
  geom_smooth(se = FALSE) +
  ylab("-log10 p value") +
  xlab("log2 fold change") +
  ggtitle("Fig. S9g - Quantile selection")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(MyD88_quant$result, aes(x=logFC, y=n.DE)) +
  geom_point(shape = 21, size = 3) + theme_bw() +
  geom_smooth(se = FALSE) + theme(aspect.ratio = 1/5) +
  scale_x_continuous(limits = c(0.085, 0.4)) + 
  scale_y_continuous(limits = c(0, 8000)) +
  ggtitle("Fig. S9g - Quantile selection - Top") +
  xlab("log2 fold change")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(MyD88_quant$result, aes(x=-log10(adj.P.val), y=n.DE)) +
  geom_point(shape = 21, size = 3) + theme_bw() +
  geom_smooth(se = FALSE) + theme(aspect.ratio = 1/5) +
  scale_x_continuous(limits = c(0.3, 0.9), breaks = c(0.3, 0.45, 0.6, 0.75, 0.9)) + 
  scale_y_continuous(limits = c(0, 8000)) +
  ggtitle("Fig. S9g - Quantile selection - Side") +
  xlab("-log10 p value")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

### Similarities in the gene ranks

source("./source/Rank_overlap_quant.R")
summary(quantile_overlap_MyD88$correlation)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.8274  0.9361  0.9654  0.9584  0.9850  1.0000
ggplot(quantile_overlap_MyD88, aes(x=experiment, y=correlation)) +
  geom_boxplot(outlier.colour = "black", fill = "#C2C2C2") +
  geom_point(position = position_dodge2(0.4), shape=21, alpha=0.0005) +
  theme_bw() + theme(aspect.ratio = 4) + 
  ggtitle("Fig. S9h - Pearson's correlation") + 
  scale_y_continuous(limits = c(0,1)) +
  xlab("") +
  ylab("rank-rank Pearson's correlation (r)")

tmp <- quantile_overlap_MyD88_matrix

for (i in 2:dim(tmp)[2]) {
  tmp[1:(i-1),i] <- NA
}

pheatmap(tmp, 
         scale = "none", 
         cluster_rows = F, 
         cluster_cols = F, 
         color = inferno(direction = -1, 50), 
         show_rownames = F, show_colnames = F, 
         cellwidth = 4, cellheight = 4, 
         breaks = seq(0.7,1,length.out = 50), 
         border_color = NA, na_col = NA, 
         main = "Fig. S9h -Pearson's correlation")

rm(tmp, i)

2 Session info

info <- sessionInfo()
info
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggsci_2.9           foreach_1.5.1       viridis_0.5.1      
##  [4] viridisLite_0.3.0   pheatmap_1.0.12     ggpubr_0.4.0       
##  [7] Rmisc_1.5           plyr_1.8.6          lattice_0.20-41    
## [10] fgsea_1.12.0        Rcpp_1.0.5          limma_3.46.0       
## [13] reshape2_1.4.4      forcats_0.5.0       stringr_1.4.0      
## [16] dplyr_1.0.2         purrr_0.3.4         readr_1.4.0        
## [19] tidyr_1.1.2         tibble_3.0.4        tidyverse_1.3.0    
## [22] ggplot2_3.3.2       huva.db_0.1.4-2     huva_0.1.4         
## [25] BiocManager_1.30.10
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-0            ggsignif_0.6.0             
##   [3] ellipsis_0.3.1              rio_0.5.16                 
##   [5] XVector_0.30.0              fs_1.5.0                   
##   [7] GenomicRanges_1.42.0        rstudioapi_0.13            
##   [9] farver_2.0.3                bit64_4.0.5                
##  [11] fansi_0.4.1                 AnnotationDbi_1.52.0       
##  [13] lubridate_1.7.9.2           xml2_1.3.2                 
##  [15] splines_4.0.3               codetools_0.2-18           
##  [17] knitr_1.30                  jsonlite_1.7.2             
##  [19] broom_0.7.3                 annotate_1.68.0            
##  [21] dbplyr_2.0.0                graph_1.68.0               
##  [23] compiler_4.0.3              httr_1.4.2                 
##  [25] backports_1.2.1             assertthat_0.2.1           
##  [27] Matrix_1.3-0                lazyeval_0.2.2             
##  [29] cli_2.2.0                   htmltools_0.5.0            
##  [31] tools_4.0.3                 gtable_0.3.0               
##  [33] glue_1.4.2                  GenomeInfoDbData_1.2.4     
##  [35] fastmatch_1.1-0             carData_3.0-4              
##  [37] Biobase_2.50.0              cellranger_1.1.0           
##  [39] vctrs_0.3.6                 nlme_3.1-151               
##  [41] iterators_1.0.13            xfun_0.19                  
##  [43] rvest_0.3.6                 openxlsx_4.2.3             
##  [45] lifecycle_0.2.0             rstatix_0.6.0              
##  [47] XML_3.99-0.5                zlibbioc_1.36.0            
##  [49] scales_1.1.1                hms_0.5.3                  
##  [51] MatrixGenerics_1.2.0        parallel_4.0.3             
##  [53] SummarizedExperiment_1.20.0 RColorBrewer_1.1-2         
##  [55] yaml_2.2.1                  curl_4.3                   
##  [57] memoise_1.1.0               gridExtra_2.3              
##  [59] stringi_1.5.3               RSQLite_2.2.1              
##  [61] GSVA_1.38.0                 S4Vectors_0.28.1           
##  [63] BiocGenerics_0.36.0         zip_2.1.1                  
##  [65] BiocParallel_1.24.1         GenomeInfoDb_1.26.2        
##  [67] rlang_0.4.9                 pkgconfig_2.0.3            
##  [69] matrixStats_0.57.0          bitops_1.0-6               
##  [71] evaluate_0.14               labeling_0.4.2             
##  [73] htmlwidgets_1.5.3           bit_4.0.4                  
##  [75] tidyselect_1.1.0            GSEABase_1.52.1            
##  [77] magrittr_2.0.1              R6_2.5.0                   
##  [79] IRanges_2.24.1              generics_0.1.0             
##  [81] DelayedArray_0.16.0         DBI_1.1.0                  
##  [83] mgcv_1.8-33                 pillar_1.4.7               
##  [85] haven_2.3.1                 foreign_0.8-81             
##  [87] withr_2.3.0                 abind_1.4-5                
##  [89] RCurl_1.98-1.2              modelr_0.1.8               
##  [91] crayon_1.3.4                car_3.0-10                 
##  [93] plotly_4.9.2.2              rmarkdown_2.6              
##  [95] grid_4.0.3                  readxl_1.3.1               
##  [97] data.table_1.13.4           blob_1.2.1                 
##  [99] reprex_0.3.0                digest_0.6.27              
## [101] xtable_1.8-4                stats4_4.0.3               
## [103] munsell_0.5.0

3 Save environment

save.image(paste("./data/",Sys.Date(), "_huva_Figure_3.RData", sep = ""))